#!/usr/bin/env python
# encoding: utf-8
"""
#-------------------------------------------------------------------#
#                   CONFIDENTIAL --- CUSTOM STUDIOS                 #     
#-------------------------------------------------------------------#
#                                                                   #
#                   @Project Name : Globallawonline                #
#                                                                   #
#                   @File Name    : NoSqlPdfClean.py                      #
#                                                                   #
#                   @Programmer   : 李建                            #
#                                                                   #  
#                   @Start Date   : 2021/1/20 11:14                 #
#                                                                   #
#                   @Last Update  : 2021/1/20 11:14                 #
#                                                                   #
#-------------------------------------------------------------------#
# Classes: deletes PDF files that have no matching SQL record       #
#                                                                   #
#-------------------------------------------------------------------#
"""
import datetime
import os

import pymysql

from settings import CJSQLCON, FILES_DIR


class NoSqlPdfClean:
    """Remove files on disk that have no matching record in the MySQL database.

    For every (table, country) combination the class looks up the file names
    stored in the ``SYS_FLD_DIGITFILENAME`` column, walks the corresponding
    directory ``FILES_DIR/<table dir>/<country dir>`` and deletes every file
    whose name is not among the stored names.  Each deletion is appended to
    ``nosql.txt`` in the current working directory.
    """

    # Database tables that store the document records.
    TABLE = ['lawtext', 'lawcasetext', 'treaty']
    # Table name -> directory segment used below FILES_DIR.
    Tablepath = {'lawtext': 'Law', 'lawcasetext': 'case', 'treaty': 'Treaty'}
    # Country codes compared against the SortA column.
    Country = ['LAWCOUNTRYXJP', 'LAWCOUNTRYFLB', 'LAWCOUNTRYWL', 'LAWCOUNTRYYDNXY', 'LAWCOUNTRYTG', 'LAWCOUNTRYMLXY',
               'LAWCOUNTRYYN', 'LAWCOUNTRYMD', 'LAWCOUNTRYLW', 'LAWCOUNTRYJPZ']
    # Country code -> directory segment used below the table directory.
    Countrypath = {'LAWCOUNTRYXJP': 'Singapore', 'LAWCOUNTRYFLB': 'Philippines', 'LAWCOUNTRYWL': 'Brunei',
                   'LAWCOUNTRYYDNXY': 'Indonesia', 'LAWCOUNTRYTG': 'Thailand', 'LAWCOUNTRYMLXY': 'Malaysia',
                   'LAWCOUNTRYYN': 'Vietnam', 'LAWCOUNTRYMD': 'Myanmar', 'LAWCOUNTRYLW': 'Laos',
                   'LAWCOUNTRYJPZ': 'Cambodia'}

    def __init__(self):
        """Open the MySQL connection described by ``CJSQLCON``."""
        self.sqlconn = pymysql.connect(
            host=CJSQLCON['HOST'],
            port=CJSQLCON['PORT'],
            db=CJSQLCON['DB'],
            user=CJSQLCON['USER'],
            passwd=CJSQLCON['PASSWD'],
            charset='utf8',
        )

    def close(self):
        """Close the MySQL connection."""
        self.sqlconn.close()

    def file_name(self, file_dir, table, country, urlcursor):
        """
        Delete every file below ``file_dir`` that has no SQL record.

        A file counts as recorded when its name equals a
        ``SYS_FLD_DIGITFILENAME`` value of a row in ``table`` whose ``SortA``
        column equals ``country``.  If the lookup itself fails, the error is
        printed and nothing is deleted.  A lookup that succeeds but returns
        no rows still causes every file below ``file_dir`` to be removed.

        :param file_dir: root directory to walk (sub-directories included)
        :param table: name of the SQL table to query
        :param country: country code matched against the SortA column
        :param urlcursor: an open SQL cursor used to run the query
        :return: None
        """
        # Identifiers cannot be bound as query parameters, so the table name
        # is still interpolated; the country value is bound by the driver.
        sql = "SELECT SYS_FLD_DIGITFILENAME FROM %s WHERE SortA = %%s" % table
        try:
            urlcursor.execute(sql, (country,))
            # A set gives O(1) membership tests for the walk below.
            known_files = {row[0] for row in urlcursor.fetchall()}
        except Exception as e:
            print('%s：mysql查找数据异常：' % str(datetime.datetime.now()) + str(e))
            # Without a reliable list of recorded files every file would look
            # orphaned and be deleted, so stop here instead.
            return

        processed = 0
        removed = 0
        for root, _dirs, files in os.walk(file_dir, topdown=False):
            for pdffile in files:
                processed += 1
                print("\r已处理: %d" % processed, end="")
                if pdffile in known_files:
                    continue
                removed += 1
                pdfpath = os.path.join(root, pdffile)
                # Log before deleting so the record survives a failed removal.
                with open('nosql.txt', 'a', encoding='utf-8') as f:
                    f.write(str(datetime.datetime.now()) + ":" + str(removed) + '    ' + pdfpath + '\n')
                if os.path.exists(pdfpath):
                    os.remove(pdfpath)

    def pdfclean(self):
        """
        Run the cleanup for every table/country combination.

        The cursor and the connection are closed when the run ends, whether
        it succeeds or raises, so the instance cannot be reused afterwards.

        :return: True once all directories have been processed
        """
        print(str(datetime.datetime.now()) + "：开始进行无sql记录的pdf文件清洗")
        try:
            urlcursor = self.sqlconn.cursor()
            try:
                for table in self.TABLE:
                    tablepath = self.Tablepath[table]
                    for country in self.Country:
                        countrypath = self.Countrypath[country]
                        path = r'%s/%s/%s' % (FILES_DIR, tablepath, countrypath)
                        self.file_name(path, table, country, urlcursor)
            finally:
                urlcursor.close()
        finally:
            self.close()
        print(str(datetime.datetime.now()) + "：无sql记录的pdf文件清洗结束")
        return True

